

import openpyxl
import requests
from bs4 import BeautifulSoup

# URL of the Zhihu "posts" page for the user "arlionn".
# NOTE(review): not referenced by any code visible in this file.
DOWNLOAD_URL = 'https://www.zhihu.com/people/arlionn/posts'
# Module-level accumulators: parse_html() appends one entry to each list per
# parsed post, and main() hands both lists to write2xlsm().
paper_name_list = []
paper_url_list = []

def parse_html(html):
    """Collect post titles and links from one page of HTML markup.

    Each ``div`` with class ``List-item`` is searched for the anchor inside
    its ``h2`` with class ``ContentItem-title``. The anchor text is appended
    to the module-level ``paper_name_list`` and its ``href`` to
    ``paper_url_list``; both are appended together so the two lists stay
    index-aligned.

    Args:
        html: The page markup as a string.

    Returns:
        None. Results are accumulated in the module-level lists.
    """
    soup = BeautifulSoup(html, "lxml")

    for paper in soup.find_all('div', attrs={'class': 'List-item'}):
        # Resolve the title link once. A list item without a title heading or
        # anchor is skipped instead of raising AttributeError on None.
        title = paper.find('h2', attrs={'class': 'ContentItem-title'})
        link = title.find('a') if title is not None else None
        if link is None:
            continue

        # An anchor without an href yields None; store an empty string so the
        # name and URL lists remain the same length.
        url = link.get('href') or ''

        # str.strip('//') treats its argument as a set of characters and
        # would remove every leading AND trailing '/'. Only the leading "//"
        # of a protocol-relative URL is meant to be dropped.
        if url.startswith('//'):
            url = url[2:]

        paper_name_list.append(link.getText())
        paper_url_list.append(url)

def write2xlsm(list1, list2, filename='outTest.xlsx'):
    """Write two parallel lists into the first two columns of a new workbook.

    A worksheet named ``sheet1`` is inserted at index 0 of a fresh workbook.
    Row ``n`` (1-based) receives ``list1[n-1]`` in column 1 and
    ``list2[n-1]`` in column 2, then the workbook is saved.

    Args:
        list1: Values for column 1, one per row.
        list2: Values for column 2, paired with ``list1`` by position.
        filename: Path of the workbook to save. Defaults to
            ``'outTest.xlsx'``, the name that was previously hard-coded.

    Returns:
        None.

    Note:
        Items are paired with ``zip``, so if the lists differ in length
        only the rows for which both values exist are written.
    """
    wb = openpyxl.Workbook()

    # create_sheet() returns the new worksheet, so looking it up again by
    # title is unnecessary.
    ws = wb.create_sheet('sheet1', 0)

    for row, (first, second) in enumerate(zip(list1, list2), start=1):
        ws.cell(row, 1).value = first
        ws.cell(row, 2).value = second

    wb.save(filename)

def read_html(file):
    """Return the full text of *file*, decoded as UTF-8.

    Args:
        file: Path of the text file to read.

    Returns:
        The entire file content as a single string.
    """
    with open(file, mode="r", encoding="utf-8") as source:
        return source.read()

def main():
    """Parse the saved pages ``page1.txt`` .. ``page12.txt`` and export them.

    Each file is read from the current working directory and passed to
    parse_html(), which fills the module-level name/URL lists. A progress
    line is printed per page, and the collected lists are finally written
    out through write2xlsm().
    """
    page_numbers = range(1, 13)

    for page_no in page_numbers:
        page_file = 'page' + str(page_no) + '.txt'
        parse_html(read_html(page_file))
        print("finish page ", page_no)

    write2xlsm(paper_name_list, paper_url_list)

# Run the export only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
